# -*- coding: utf-8 -*-
import time, json
import scrapy, re
import urllib.parse


class SohuSpider(scrapy.Spider):
    """Crawl the btime.com mobile news API, one request per channel.

    ``start_requests`` issues one listing request per channel in
    :data:`CHANNELS`; ``parse`` flattens each returned article into a
    plain item dict.
    """

    name = 'shijian_channel'
    # Listing endpoint; {} receives the percent-encoded channel name
    # plus its API cid (see CHANNELS below).
    base_url = 'https://api.app.btime.com/news/listV4?cname={}&offset=0&refresh_type=1&refresh_total=1&os_type=h5'

    # site_classify_id -> percent-encoded channel name + "&cid=..." pair.
    # The encoded names are UTF-8 channel labels (entertainment, funny,
    # tech, education, ...).
    CHANNELS = {
        '403': "%E5%A8%B1%E4%B9%90&cid=7af450a9dcc60c0af56db7421a461c02",
        '430': '%E6%90%9E%E7%AC%91&cid=339af3318afb7adf81788aca69e4a14a',
        '415': '%E7%A7%91%E6%8A%80&cid=ccaa7b3dafbdd70a7a0abeb17902e994',
        '399': '%E6%95%99%E8%82%B2&cid=bbd2bd21e4b5efe896e7faf1a32101fd',
        '412': '%E7%94%9F%E6%B4%BB&cid=4e9715a3087a5cf063107c9132be55f5',
        '408': '%E4%BA%B2%E5%AD%90&cid=0bd9f2755fe97a7faf17a825c0491e12',
        '405': '%E6%83%85%E6%84%9F&cid=8199cf9f2aa82c7ab093eda868cb32bb',
        '400': '%E6%B1%BD%E8%BD%A6&cid=f5fc97507007cf44a70d940311b952d2',
        '410': '%E6%B8%B8%E6%88%8F&cid=66f51223cf086acc4c4fb1f4bd7594cc',
        '409': '%E6%97%B6%E5%B0%9A&cid=963a9c98ca184610c2a3054749eec76f',
        '401': '%E4%BD%93%E8%82%B2&cid=eaabd3750a92632e39431d1197b80acc',
        '396': '%E5%81%A5%E5%BA%B7&cid=6732ac35350bcbbc2a8dda02a08a98dd',
        '407': '%E6%97%85%E6%B8%B8&cid=74b8334b484cbcb5d07e0ba82833e51d',
        '416': '%E6%88%BF%E4%BA%A7&cid=92db0f7330c8f3b412c048a65e6d6f66',
    }

    def start_requests(self):
        """Yield one listing request per configured channel.

        The originating category id travels in ``meta['item']`` so
        ``parse`` can tag every article with it.
        """
        for cate, channel in self.CHANNELS.items():
            url = self.base_url.format(channel)
            item = {'site_classify_id': cate, 'url': url}
            yield scrapy.Request(url, meta={'item': item})

    @staticmethod
    def _parse_count(raw):
        """Normalize a count string like ``'1.2万'`` to a plain integer string.

        Returns ``raw`` unchanged when it has no ``'万'`` (10 000) suffix,
        and ``'0'`` when the value is missing/empty.
        """
        if not raw:
            return '0'
        if '万' in raw:
            # round() is required: truncating str(float * 10000) loses 1
            # for values such as '0.57万' (0.57 * 10000 == 5699.999...).
            return str(round(float(raw.replace('万', '')) * 10000))
        return raw

    def parse(self, response):
        """Parse one channel listing; yield a flat dict per article."""
        cate = response.meta['item']['site_classify_id']
        payload = json.loads(response.text)
        for entry in payload['data']['data']:
            # Per-article fields live under entry['data']; fetch it once.
            info = entry.get('data') or {}
            yield {
                # 'pdate' is a Unix timestamp (seconds) — formatted in local time.
                'publish_time': time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(int(info.get('pdate')))),
                'url': entry.get('url'),
                'title': info.get('title'),
                'spider_time': time.strftime('%Y-%m-%d %X', time.localtime()),
                'site_id': "112",  # site classification id
                'site_classify_id': cate,
                'author': info.get('source'),
                'browse_count': self._parse_count(info.get('watches')),
                'comment_count': info.get('comments') or 0,
            }
